#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib
import urllib2
from bs4 import BeautifulSoup
import re
import os
import sys
import json
import MySQLdb
import datetime
import time
import random
# Python 2 hack: restore sys.setdefaultencoding (deleted by site.py at
# startup) and force the interpreter's default encoding to UTF-8 so
# implicit str<->unicode conversions of the scraped Chinese text do not
# raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf-8')

class Taobaomm(object):
    """Scraper for the Taobao model ("MM") showcase.

    Walks the paginated listing, scrapes each model's profile page,
    stores the profile row in MySQL and downloads her photo albums
    into per-album directories under ``self.images_path``.
    """

    def __init__(self):
        # Fixed: the original misspelled this as ``__int__``, so the
        # constructor never ran and none of these attributes existed.
        self.images_path = "/data/images/"
        self.taobao_url = "https://mm.taobao.com/json/request_top_list.htm"
        self.personal_url = "https://mm.taobao.com/self/info/model_info_show.htm"
        self.album_url = "https://mm.taobao.com/self/album/open_album_list.htm"

        self.mysql_conn = MySQLdb.connect(host="localhost", user="root", passwd="password", port=3306, db='taobao', connect_timeout=5, charset='utf8')
        self.curs = self.mysql_conn.cursor()

    def getUserList(self, p):
        """Scrape listing page *p* and store every model found.

        Returns False when the page contains no list items (end of the
        listing); otherwise returns None.  Errors are printed and
        swallowed so one bad page does not abort the whole crawl.
        """
        url = self.taobao_url + "?page=" + str(p)
        try:
            content = self.getContent(url)
            soup = BeautifulSoup(content, 'lxml')

            # Fixed inverted test: stop only when NO items exist.  The
            # original returned as soon as items WERE found, so the
            # scraping loop below could never execute.
            if not soup.select('.list-item'):
                return False

            for item in soup.select('.list-item'):
                # profile page URL
                profile_url = "https:" + item.select('.lady-avatar')[0]['href']
                # display name
                username = item.select(".lady-name")[0].get_text()
                # age
                age = item.select("em strong")[0].get_text()
                # numeric user id, extracted from the profile URL
                user_id = re.findall(re.compile(r'[1-9]\d*'), profile_url)[0]
                # follower count
                fans = item.select("em strong")[1].get_text()

                # fetch the personal-details page
                personalurl = self.personal_url + "?user_id=" + str(user_id)
                personal_content = self.getContent(personalurl)
                soup_ = BeautifulSoup(personal_content, 'lxml')

                birthday_text = soup_.select(".mm-p-base-info ul li")[1].select("span")[0].get_text()
                pattern_int = re.compile(r'\d+')
                year = datetime.datetime.now().strftime("%Y")
                # birth year = current year - age; month/day come from
                # the numbers found in the profile's birthday text
                birthday = str(int(year) - int(age)) + "-" + '-'.join(re.findall(pattern_int, birthday_text))

                city = soup_.select(".mm-p-base-info ul li")[2].select("span")[0].get_text()
                jobs = soup_.select(".mm-p-base-info ul li")[3].select("span")[0].get_text()
                blood_type = soup_.select(".mm-p-base-info ul li")[4].select("span")[0].get_text()
                school = soup_.select(".mm-p-base-info ul li")[5].select("span")[0].get_text()
                style = soup_.select(".mm-p-base-info ul li")[6].select("span")[0].get_text()
                height = soup_.select(".mm-p-base-info ul li")[7].select("p")[0].get_text()
                weight = soup_.select(".mm-p-base-info ul li")[8].select("p")[0].get_text()
                size = soup_.select(".mm-p-base-info ul li")[9].select("p")[0].get_text()
                bar = soup_.select(".mm-p-base-info ul li")[10].select("p")[0].get_text()
                shose = soup_.select(".mm-p-base-info ul li")[11].select("p")[0].get_text()
                experience = soup_.select(".mm-p-experience-info p")[0].get_text()

                param = (int(user_id), username, int(age), birthday, city, int(fans), profile_url, jobs, blood_type, school, style, height, weight, size, bar, shose, experience)
                # Parameterized query: the original interpolated scraped
                # strings straight into the SQL text (injection risk and
                # broken quoting on any value containing a quote).
                sql = ("insert into mm (user_id, user_name, age, birthday, city, fans, url, jobs, "
                       "blood_type, school, style, height, weight, size, bar, shose, experience) "
                       "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

                self.mysql_execute(sql, param)

                self.getPhotoAlbum(user_id, username)

        except Exception as e:
            print(e)

    def mysql_execute(self, sql, params=None):
        """Execute *sql* with optional *params*, then commit.

        Rolls back and prints the error on failure.  (The original code
        called ``mysql_execute`` but never defined it anywhere.)
        """
        try:
            self.curs.execute(sql, params)
            self.mysql_conn.commit()
        except Exception as e:
            self.mysql_conn.rollback()
            print(e)

    # fetch a user's album list and download every album
    def getPhotoAlbum(self, user_id, username):
        """Download each album of *user_id* into its own directory."""
        url = self.album_url + "?user_id=" + str(user_id)
        content = self.getContent(url)
        soup = BeautifulSoup(content, 'lxml')

        cells = soup.select(".mm-photo-list .mm-photo-cell")
        pattern = re.compile(r'[1-9]\d*')
        for item in cells:
            album_url = item.select(".mm-first")[0]['href']
            # second number in the href is the album id (the first is
            # the user id); renamed from ``id`` to avoid shadowing the
            # builtin
            ids = re.findall(pattern, album_url)
            album_id = ids[1]
            album_name = item.select(".mm-photo-cell-middle h4 a")[0].get_text().strip()

            album_path = self.images_path + username + "/" + album_name
            self.mkdir(album_path)

            self.getAlbumPic(user_id, album_id, album_path)

    # fetch the pictures of one album
    def getAlbumPic(self, user_id, album_id, album_path):
        """Page through the album's JSON API and save every picture."""
        url = "https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=%s&album_id=%s" % (str(user_id), str(album_id))
        for p in range(1, 15):
            url_site = url + "&page=" + str(p)
            json_data = self.getContent(url_site)
            data = json.loads(json_data)
            if data['isError'] == '0' and data['isOK'] == '1':
                for info in data['picList']:
                    pic_id = info['picId']
                    pic_url = "https:" + info['picUrl']
                    # picId may come back as a number; the filename
                    # concatenation in saveImage needs a string
                    self.saveImage(album_path, str(pic_id), pic_url)
            else:
                break

    # fetch a web page
    def getContent(self, url, proxy=None, coding='utf-8'):
        """Fetch *url* through the hard-coded HTTP proxy and decode it.

        Returns the decoded body, or None on any error (which is
        printed).
        """
        try:
            user_agent = self.getUserAgent()
            headers = {"User-Agent": user_agent}

            proxies = {"http": "114.244.112.220:8118"}  # proxy to crawl through
            proxy_s = urllib2.ProxyHandler(proxies)
            opener = urllib2.build_opener(proxy_s)
            urllib2.install_opener(opener)

            # Fixed: headers must be passed by keyword.  The second
            # positional argument of Request is the POST body, so the
            # original silently issued POST requests with the header
            # dict as payload and sent no User-Agent at all.
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request, timeout=5)
            return response.read().decode(coding)
        except Exception as e:
            print(e)

    # save a remote image to disk
    def saveImage(self, album_path, pic_name, pic_url):
        """Download *pic_url* into ``<album_path>/<pic_name>.jpg``."""
        pic = urllib.urlopen(pic_url).read()
        # ``with`` guarantees the handle is closed even if write fails
        with open(album_path + "/" + pic_name + ".jpg", 'wb') as f:
            f.write(pic)

    # append text to a file
    def saveFile(self, file_path, name, content):
        """Append *content* (UTF-8 encoded) to ``<file_path>/<name>.txt``."""
        file_name = file_path + "/" + name + '.txt'
        # the original opened this handle and never closed it
        with open(file_name, "a+") as f:
            f.write(content.encode('utf-8'))

    # create a directory (and parents) if it does not exist
    def mkdir(self, path):
        """Create *path* (including parents) when missing."""
        path = path.strip()
        if not os.path.exists(path):
            os.makedirs(path)

    def getAgentIp(self):
        """Scrape xicidaili.com for up to 10 fast (<= 0.5s) HTTP proxies.

        Returns a list of ``"ip:port"`` strings.
        """
        url = "http://www.xicidaili.com/nn/"

        content = self.getContent(url)
        soup = BeautifulSoup(content, "lxml")
        ip_list = soup.select("#ip_list tr")
        agent_ip = []
        for item in ip_list:
            if len(agent_ip) >= 10:
                break  # original kept scanning every remaining row
            bar_cells = item.select(".country div.bar")
            if not bar_cells:
                continue
            speed_ = bar_cells[0]['title']
            if not speed_:
                continue
            speed = re.findall(re.compile(r'^[0-9]\d*\.\d*|0\.\d*[1-9]\d*$'), speed_)[0]
            if float(speed) <= 0.5:
                ip = item.select("td")[1].get_text() + ":" + item.select("td")[2].get_text()
                agent_ip.append(ip)
        return agent_ip

    def getUserAgent(self):
        """Return a randomly chosen browser User-Agent string."""
        user_agent_list = [
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Beamrise/17.2.0.9 Chrome/17.0.939.0 Safari/535.8",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/18.6.872.0 Safari/535.2 UNTRUSTED/1.0 3gpp-gba UNTRUSTED/1.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1",
            "Mozilla/5.0 (Windows NT 6.1; rv:12.0) Gecko/20120403211507 Firefox/12.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)",
            "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
            "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.7.62 Version/11.01",
            "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; Sleipnir/2.9.8)",
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.65 Safari/534.24",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.107 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7ad-imcjapan-syosyaman-xkgi3lqg03!wgz",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7xs5D9rRDFpg2g",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.861.0 Safari/535.2",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.113 Safari/534.30",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.12 Safari/534.24",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.669.0 Safari/534.20",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.655.0 Safari/534.17",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.134 Safari/534.16",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.19 Safari/534.13",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10"
        ]
        return random.choice(user_agent_list)

